notebook.community

Edit and run



In [1]:

    
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.metrics import homogeneity_score

from scipy.cluster.hierarchy import linkage, dendrogram

np.set_printoptions(suppress=True, precision=5)


%matplotlib inline



In [2]:

    
# Synthetic 2-D dataset: 150 samples generated around 3 centers with a small
# spread (cluster_std=0.5). y holds the generating blob label of each sample.
X, y = make_blobs(n_samples = 150, n_features=2, 
                  centers=3, cluster_std=0.5, shuffle=True, random_state=0)



In [3]:

    
# Plot the raw points in a single colour (no label information used).
plt.scatter(X[:, 0], X[:, 1], c = "steelblue", marker = "o", s = 50)
plt.xlabel("X1")
plt.ylabel("X2")









    Out[3]:





Text(0, 0.5, 'X2')



In [4]:

    
# K-Means with 3 clusters, random centroid initialisation and 10 restarts
# (n_init). fit_predict returns the cluster index assigned to each row of X.
km = KMeans(n_clusters=3, init="random", n_init = 10, 
            max_iter = 300, tol = 1e-04, random_state=0)
y_km = km.fit_predict(X)



In [5]:

    
def show_cluster(X, y, estimator = None, ignore_noise = True):
    """Scatter-plot the first two columns of X, one colour per distinct label.

    Draws on the current matplotlib axes, so it can be used inside
    ``plt.subplot`` layouts.

    Parameters
    ----------
    X : 2-D array with at least two columns; only columns 0 and 1 are drawn.
    y : array-like of labels, one per row of X.
    estimator : optional fitted estimator. When it exposes
        ``cluster_centers_`` the centroids are overlaid as black stars.
    ignore_noise : when True, rows labelled -1 are not drawn.
    """
    y = np.asarray(y)  # allow plain lists; `y == k` below needs an array

    # np.unique gives a sorted, deterministic label order (and legend order).
    levels = [k for k in np.unique(y) if not (ignore_noise and k == -1)]

    # Pair each label with a palette entry by position rather than indexing
    # the palette with the label value, which fails for non-contiguous labels.
    colors = sns.color_palette("husl", len(levels))
    for color, k in zip(colors, levels):
        data = X[y == k, :]
        plt.scatter(data[:, 0], data[:, 1], color = color, s = 50, label = "Cluster %s" % k)

    # Estimators without centroids (or estimator=None) simply skip the overlay.
    centroids = getattr(estimator, "cluster_centers_", None)
    if centroids is not None:
        plt.scatter(centroids[:, 0], centroids[:, 1], color = "black", marker = "*", s = 150)

    plt.xlabel("X1")
    plt.ylabel("X2")
    plt.legend(loc = "lower left")
    
# Visualise the K-Means assignment; km is passed so its centroids are drawn as stars.
show_cluster(X, y_km, km)



In [6]:

    
# Coordinates of the fitted centroids, one row per cluster.
km.cluster_centers_









    Out[6]:





array([[ 0.93297,  4.35421],
       [ 2.06522,  0.96137],
       [-1.59473,  2.92237]])



In [7]:

    
# inertia_ is the sum of *squared* distances of samples to their closest cluster center.
print("Distortion (Within Cluster SSE): %.2f" % km.inertia_)









    



Distortion (Within Cluster SSE): 72.48



In [8]:

    
# Homogeneity of the K-Means assignment with respect to the generating blob
# labels; 1.0 means every cluster contains members of a single class only.
homogeneity_score(y, y_km)









    Out[8]:





1.0



In [9]:

    
# Repeat the experiment on noisier blobs (cluster_std=1.0 instead of 0.5).
# Note: this rebinds X, y, km and y_km from the earlier cells.
X, y = make_blobs(n_samples = 150, n_features=2, centers=3, 
                  cluster_std=1.0, shuffle=True, random_state=0)
km = KMeans(n_clusters=3, init="random", n_init = 10, 
            max_iter = 300, tol = 1e-04, random_state=0)
y_km = km.fit_predict(X)
print("Homogeneity score: ", homogeneity_score(y, y_km), "Inertia: ", km.inertia_)
# Side by side: generating labels (left) vs. K-Means assignment (right).
# km is passed to both calls, so the fitted centroids are overlaid on both panels.
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
show_cluster(X, y, km)
plt.title("True Clusters")
plt.subplot(1, 2, 2)
show_cluster(X, y_km, km)
plt.title("Estimated clusters")









    



Homogeneity score:  0.6812190697089529 Inertia:  262.72046565264066






    Out[9]:





Text(0.5, 1.0, 'Estimated clusters')

Find optimal number of clusters using elbow method



In [10]:

    
def find_elbow(X, n = 10):
    """Plot K-Means distortion (inertia) against the number of clusters.

    One model is fitted for every k in ``range(1, n)`` (so k = n itself is
    not evaluated) and the resulting inertia values are drawn as a line plot
    on the current axes. Nothing is returned.
    """
    ks = range(1, n)
    distortions = [
        KMeans(n_clusters=k, max_iter=300, n_init=10, random_state=0, init="k-means++")
        .fit(X)
        .inertia_
        for k in ks
    ]
    plt.plot(ks, distortions)
    plt.xlabel("Number of clusters (K)")
    plt.ylabel("Distortion")

# Distortion curve for k = 1..9 (default n=10) on the current X.
find_elbow(X)

Find number of clusters from Dendrogram



In [11]:

    
# Agglomerative clustering with complete linkage on Euclidean distances,
# drawn as a dendrogram. The return value is bound to f so the cell does not
# echo the dendrogram's result dictionary.
plt.figure(figsize = (15, 10))
row_clusters = linkage(X, method="complete", metric="euclidean")
f = dendrogram(row_clusters)

Half Moon Dataset and DBSCAN



In [12]:

    
from sklearn.datasets import make_moons



In [13]:

    
# Two interleaving half-moon shapes (200 samples, Gaussian noise 0.09).
# Note: this rebinds X and y from the blob experiments above.
X, y = make_moons(n_samples=200, noise=0.09, random_state=0)



In [14]:

    
# Plot the raw half-moon points in a single colour.
plt.scatter(X[:, 0], X[:, 1], c = "steelblue", marker = "o", s = 50)
plt.xlabel("X1")
plt.ylabel("X2")









    Out[14]:





Text(0, 0.5, 'X2')



In [15]:

    
# K-Means with 2 clusters on the half-moon data.
km = KMeans(n_clusters=2, init="random", n_init = 10, max_iter = 300, tol = 1e-04, random_state=0)
y_km = km.fit_predict(X)

# Left: true moon labels (no estimator, so no centroids are drawn).
# Right: K-Means assignment with the fitted centroids overlaid.
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
show_cluster(X, y)
plt.title("True Clusters")
plt.subplot(1, 2, 2)
show_cluster(X, y_km, km)
plt.title("Estimated clusters")









    Out[15]:





Text(0.5, 1.0, 'Estimated clusters')



In [16]:

    
# Homogeneity of the K-Means labels against the true moon labels.
homogeneity_score(y, y_km)









    Out[16]:





0.1810982320182603



In [17]:

    
# Density-based clustering: a point needs min_samples=10 neighbours within
# eps=0.2 (Euclidean) to be a core point. fit_predict labels noise as -1.
dbscan = DBSCAN(eps=0.2, min_samples=10, metric="euclidean")
y_db = dbscan.fit_predict(X)

# Left: true moon labels. Right: DBSCAN assignment (show_cluster hides the
# -1 noise label by default).
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
show_cluster(X, y, dbscan)
plt.title("True Clusters")
plt.subplot(1, 2, 2)
show_cluster(X, y_db, dbscan)
plt.title("Estimated clusters")









    Out[17]:





Text(0.5, 1.0, 'Estimated clusters')



In [18]:

    
# Count the distinct DBSCAN labels, leaving out the noise label (-1).
labels = set(y_db)
labels.discard(-1)  # Noise; discard is a no-op when -1 is absent
print("No of clusters: ", len(labels))









    



No of clusters:  7



In [19]:

    
# Homogeneity of the DBSCAN labels (noise included as label -1) against the true moon labels.
homogeneity_score(y, y_db)









    Out[19]:





0.9395815602003369

Applying clustering to grouplens movies dataset based on genre



In [20]:

    
# Load the movie table, indexed by movieId.
# NOTE(review): absolute local path — adjust to wherever movies.csv lives.
movies = pd.read_csv("/data/movielens/movies.csv", index_col="movieId")
movies.head()









    Out[20]:







  
    
      
      title
      genres
    
    
      movieId
      
      
    
  
  
    
      1
      Toy Story (1995)
      Adventure|Animation|Children|Comedy|Fantasy
    
    
      2
      Jumanji (1995)
      Adventure|Children|Fantasy
    
    
      3
      Grumpier Old Men (1995)
      Comedy|Romance
    
    
      4
      Waiting to Exhale (1995)
      Comedy|Drama|Romance
    
    
      5
      Father of the Bride Part II (1995)
      Comedy



In [21]:

    
# Random sample of 10 rows (no random_state given, so it differs between runs).
movies.sample(10)









    Out[21]:







  
    
      
      title
      genres
    
    
      movieId
      
      
    
  
  
    
      2202
      Lifeboat (1944)
      Drama|War
    
    
      991
      Michael Collins (1996)
      Drama
    
    
      4863
      Female Trouble (1975)
      Comedy|Crime
    
    
      647
      Courage Under Fire (1996)
      Action|Crime|Drama|War
    
    
      85788
      Insidious (2010)
      Fantasy|Horror|Thriller
    
    
      963
      Inspector General, The (1949)
      Musical
    
    
      4570
      Big Picture, The (1989)
      Comedy|Drama
    
    
      55999
      Mr. Magorium's Wonder Emporium (2007)
      Children|Comedy|Fantasy
    
    
      640
      Diabolique (1996)
      Drama|Thriller
    
    
      1960
      Last Emperor, The (1987)
      Drama



In [22]:

    
# Drop movies that carry the "(no genres listed)" placeholder.
# - regex=False matches the placeholder as a literal substring, which avoids
#   the invalid "\(" escape sequence a non-raw regex string would need.
# - .copy() makes the result an independent frame, so adding columns to it
#   later does not raise SettingWithCopyWarning.
movies = movies[~movies["genres"].str.contains("(no genres listed)", regex=False)].copy()
movies.sample(10)









    Out[22]:







  
    
      
      title
      genres
    
    
      movieId
      
      
    
  
  
    
      3979
      Little Nicky (2000)
      Comedy
    
    
      5223
      Pauline & Paulette (Pauline en Paulette) (2001)
      Comedy|Drama
    
    
      51094
      Gray Matters (2006)
      Comedy|Drama|Romance
    
    
      2828
      Dudley Do-Right (1999)
      Children|Comedy
    
    
      80083
      Dragon Ball Z: Dead Zone (Doragon bôru Z 1: Or...
      Action|Adventure|Animation|Fantasy|Sci-Fi
    
    
      113640
      Canal, The (2014)
      Horror|Thriller
    
    
      6299
      Winged Migration (Peuple migrateur, Le) (2001)
      Documentary
    
    
      1192
      Paris Is Burning (1990)
      Documentary
    
    
      68901
      Chop Shop (2007)
      Drama
    
    
      3032
      Omega Man, The (1971)
      Action|Drama|Sci-Fi|Thriller



In [23]:

    
# Build the sorted list of distinct genre names; each movie stores its genres
# as a single pipe-separated string.
genres = sorted({
    name
    for entry in movies["genres"]
    for name in entry.split(r"|")
})
print(genres, len(genres))









    



['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'] 19



In [24]:

    
def to_vector(g):
    """Encode a pipe-separated genre string as a multi-hot vector.

    The vector has one slot per entry of the module-level ``genres`` list;
    slots whose genre occurs in ``g`` are set to 1, all others stay 0.
    Raises ValueError (from ``list.index``) for a name not in ``genres``.
    """
    encoded = np.zeros(len(genres))
    positions = np.array([genres.index(name) for name in g.split(r"|")])
    encoded[positions] = 1
    return encoded

# One multi-hot genre vector per movie, stored as an object-dtype Series.
genres_idx = movies["genres"].apply(to_vector)
genres_idx.head(10)









    Out[24]:





movieId
1     [0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, ...
2     [0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...
3     [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
4     [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, ...
5     [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
6     [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...
7     [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
8     [0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
9     [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
10    [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: genres, dtype: object



In [25]:

    
# Stack the per-movie vectors into a 2-D array (rows = movies, columns = genres).
# Note: this rebinds X, which previously held the half-moon data.
X = np.array(genres_idx.tolist())
print("X.shape: ", X.shape)









    



X.shape:  (9107, 19)

Normalize the data



In [26]:

    
# Standardise every genre column (StandardScaler defaults: zero mean, unit variance).
scaler = StandardScaler()
X_std = scaler.fit_transform(X)



In [27]:

    
# Complete-linkage hierarchy on the standardised genre matrix. The plot is
# truncated (p=5, truncate_mode="level") so only the top levels of the tree
# are drawn.
plt.figure(figsize = (15, 10))
row_clusters = linkage(X_std, method="complete", metric="euclidean")
f = dendrogram(row_clusters, p = 5, truncate_mode="level")

To visualize the clusters, let's apply PCA with 2 components.



In [28]:

    
from sklearn.decomposition import KernelPCA, PCA



In [29]:

    
# Full PCA (all components) to inspect how the variance is distributed.
pca = PCA(random_state=0)
X_pca = pca.fit_transform(X_std)

# Bars: variance ratio of each component; step line: running total.
ratios = pca.explained_variance_ratio_
plt.bar(range(len(ratios)), ratios)
plt.step(range(len(ratios)), np.cumsum(ratios), 
         label = "Cumsum of Explained variance ratio")
# The step line carries a label, so draw the legend; otherwise it is never shown.
plt.legend(loc = "best")
plt.title("Explained variance")
plt.ylabel("Explained Variance Ratio")
plt.xlabel("Number of PCA components")









    Out[29]:





Text(0.5, 0, 'Number of PCA components')

The first 2 principal components have low explained-variance coverage.



In [30]:

    
# Project the standardised genre matrix onto its first two principal
# components and plot the result. Note: this rebinds pca and X_pca.
pca = PCA(random_state=0, n_components=2)
X_pca = pca.fit_transform(X_std)

plt.figure(figsize = (15, 8))
plt.scatter(X_pca[:, 0], X_pca[:, 1])
plt.xlabel("PCA1")
plt.ylabel("PCA2")









    Out[30]:





Text(0, 0.5, 'PCA2')

There is no visual indication of clusters in the 2-component PCA projection, which is consistent with the finding that the explained variance with 2 components is only 2%.



In [31]:

    
# Elbow curve for k = 1..39 on the standardised genre matrix (fits one model per k).
find_elbow(X_std, 40)



In [32]:

    
# K-Means with 8 clusters on the standardised genre matrix.
# n_init is pinned to 10, matching the other KMeans models in this notebook;
# the library default differs between scikit-learn versions, so leaving it
# implicit makes the result version-dependent.
# Note: despite its name, `knn` is a K-Means model, not k-nearest-neighbours.
knn = KMeans(n_clusters=8, n_init=10, max_iter=300, random_state=0)
y_pred = knn.fit_predict(X_std)

For each observation, compute the distance to the centroid of its assigned (nearest) cluster.



In [33]:

    
def distance(p1, p2):
    """Return the Euclidean distance between two arrays.

    Both inputs are flattened to 1-D before subtraction, so they only need to
    hold the same number of elements.
    """
    delta = p1.flatten() - p2.flatten()
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

# Euclidean distance of every movie to the centroid of the cluster it was
# assigned to. Indexing cluster_centers_ with labels_ yields one centroid row
# per sample, so the whole computation is a single vectorised expression
# instead of a Python loop over rows.
assigned_centers = knn.cluster_centers_[knn.labels_]
distances = np.linalg.norm(X_std - assigned_centers, axis=1)

# assign() returns a new frame with the extra column; this avoids setting a
# column on a frame that was produced by boolean filtering.
movies = movies.assign(distance=distances)
# The ten movies lying farthest from their own centroid.
movies.sort_values("distance", ascending=False)[:10]









    Out[33]:







  
    
      
      title
      genres
      distance
    
    
      movieId
      
      
      
    
  
  
    
      81132
      Rubber (2010)
      Action|Adventure|Comedy|Crime|Drama|Film-Noir|...
      10.008449
    
    
      81847
      Tangled (2010)
      Animation|Children|Comedy|Fantasy|Musical|Roma...
      8.401892
    
    
      83613
      Cowboys & Aliens (2011)
      Action|Sci-Fi|Thriller|Western|IMAX
      8.313656
    
    
      595
      Beauty and the Beast (1991)
      Animation|Children|Fantasy|Musical|Romance|IMAX
      8.223587
    
    
      2142
      American Tail: Fievel Goes West, An (1991)
      Adventure|Animation|Children|Musical|Western
      8.047143
    
    
      103384
      Lone Ranger, The (2013)
      Action|Adventure|Western|IMAX
      7.930896
    
    
      7374
      Home on the Range (2004)
      Animation|Children|Comedy|Musical|Western
      7.822317
    
    
      364
      Lion King, The (1994)
      Adventure|Animation|Children|Drama|Musical|IMAX
      7.558606
    
    
      3159
      Fantasia 2000 (1999)
      Animation|Children|Musical|IMAX
      7.416244
    
    
      26701
      Patlabor: The Movie (Kidô keisatsu patorebâ: T...
      Action|Animation|Crime|Drama|Film-Noir|Mystery...
      7.396372



In [34]:

    
# Random sample of 10 movies assigned to cluster 3 (unseeded, differs between runs).
movies[y_pred == 3].sample(10)









    Out[34]:







  
    
      
      title
      genres
      distance
    
    
      movieId
      
      
      
    
  
  
    
      2367
      King Kong (1976)
      Adventure|Fantasy|Romance|Sci-Fi|Thriller
      6.439895
    
    
      156607
      The Huntsman Winter's War (2016)
      Action|Adventure|Drama|Fantasy
      5.160459
    
    
      51698
      Last Mimzy, The (2007)
      Adventure|Children|Sci-Fi
      5.026697
    
    
      104419
      Justice League: Crisis on Two Earths (2010)
      Action|Animation|Sci-Fi
      5.577885
    
    
      8
      Tom and Huck (1995)
      Adventure|Children
      4.047729
    
    
      30810
      Life Aquatic with Steve Zissou, The (2004)
      Adventure|Comedy|Fantasy
      4.594276
    
    
      99764
      It's Such a Beautiful Day (2012)
      Animation|Comedy|Drama|Fantasy|Sci-Fi
      5.681293
    
    
      3287
      Tigger Movie, The (2000)
      Animation|Children
      3.966153
    
    
      95311
      Presto (2008)
      Animation|Children|Comedy|Fantasy
      4.163322
    
    
      919
      Wizard of Oz, The (1939)
      Adventure|Children|Fantasy|Musical
      5.874266



In [ ]:

	title	genres
movieId
1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy
2	Jumanji (1995)	Adventure\|Children\|Fantasy
3	Grumpier Old Men (1995)	Comedy\|Romance
4	Waiting to Exhale (1995)	Comedy\|Drama\|Romance
5	Father of the Bride Part II (1995)	Comedy

	title	genres
movieId
2202	Lifeboat (1944)	Drama\|War
991	Michael Collins (1996)	Drama
4863	Female Trouble (1975)	Comedy\|Crime
647	Courage Under Fire (1996)	Action\|Crime\|Drama\|War
85788	Insidious (2010)	Fantasy\|Horror\|Thriller
963	Inspector General, The (1949)	Musical
4570	Big Picture, The (1989)	Comedy\|Drama
55999	Mr. Magorium's Wonder Emporium (2007)	Children\|Comedy\|Fantasy
640	Diabolique (1996)	Drama\|Thriller
1960	Last Emperor, The (1987)	Drama

	title	genres
movieId
3979	Little Nicky (2000)	Comedy
5223	Pauline & Paulette (Pauline en Paulette) (2001)	Comedy\|Drama
51094	Gray Matters (2006)	Comedy\|Drama\|Romance
2828	Dudley Do-Right (1999)	Children\|Comedy
80083	Dragon Ball Z: Dead Zone (Doragon bôru Z 1: Or...	Action\|Adventure\|Animation\|Fantasy\|Sci-Fi
113640	Canal, The (2014)	Horror\|Thriller
6299	Winged Migration (Peuple migrateur, Le) (2001)	Documentary
1192	Paris Is Burning (1990)	Documentary
68901	Chop Shop (2007)	Drama
3032	Omega Man, The (1971)	Action\|Drama\|Sci-Fi\|Thriller

	title	genres	distance
movieId
81132	Rubber (2010)	Action\|Adventure\|Comedy\|Crime\|Drama\|Film-Noir\|...	10.008449
81847	Tangled (2010)	Animation\|Children\|Comedy\|Fantasy\|Musical\|Roma...	8.401892
83613	Cowboys & Aliens (2011)	Action\|Sci-Fi\|Thriller\|Western\|IMAX	8.313656
595	Beauty and the Beast (1991)	Animation\|Children\|Fantasy\|Musical\|Romance\|IMAX	8.223587
2142	American Tail: Fievel Goes West, An (1991)	Adventure\|Animation\|Children\|Musical\|Western	8.047143
103384	Lone Ranger, The (2013)	Action\|Adventure\|Western\|IMAX	7.930896
7374	Home on the Range (2004)	Animation\|Children\|Comedy\|Musical\|Western	7.822317
364	Lion King, The (1994)	Adventure\|Animation\|Children\|Drama\|Musical\|IMAX	7.558606
3159	Fantasia 2000 (1999)	Animation\|Children\|Musical\|IMAX	7.416244
26701	Patlabor: The Movie (Kidô keisatsu patorebâ: T...	Action\|Animation\|Crime\|Drama\|Film-Noir\|Mystery...	7.396372

	title	genres	distance
movieId
2367	King Kong (1976)	Adventure\|Fantasy\|Romance\|Sci-Fi\|Thriller	6.439895
156607	The Huntsman Winter's War (2016)	Action\|Adventure\|Drama\|Fantasy	5.160459
51698	Last Mimzy, The (2007)	Adventure\|Children\|Sci-Fi	5.026697
104419	Justice League: Crisis on Two Earths (2010)	Action\|Animation\|Sci-Fi	5.577885
8	Tom and Huck (1995)	Adventure\|Children	4.047729
30810	Life Aquatic with Steve Zissou, The (2004)	Adventure\|Comedy\|Fantasy	4.594276
99764	It's Such a Beautiful Day (2012)	Animation\|Comedy\|Drama\|Fantasy\|Sci-Fi	5.681293
3287	Tigger Movie, The (2000)	Animation\|Children	3.966153
95311	Presto (2008)	Animation\|Children\|Comedy\|Fantasy	4.163322
919	Wizard of Oz, The (1939)	Adventure\|Children\|Fantasy\|Musical	5.874266